library(MASS)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks MASS::select()
mydata<- MASS::Boston
#attach(Boston)
#mydata

**Number of observations and columns

nrow(mydata)
## [1] 506
ncol(mydata)
## [1] 14

** Structure of the data

str(mydata)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

** Top and bottom of the data

head(mydata)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
tail(mydata)
##        crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 501 0.22438  0  9.69    0 0.585 6.027 79.7 2.4982   6 391    19.2 396.90 14.33
## 502 0.06263  0 11.93    0 0.573 6.593 69.1 2.4786   1 273    21.0 391.99  9.67
## 503 0.04527  0 11.93    0 0.573 6.120 76.7 2.2875   1 273    21.0 396.90  9.08
## 504 0.06076  0 11.93    0 0.573 6.976 91.0 2.1675   1 273    21.0 396.90  5.64
## 505 0.10959  0 11.93    0 0.573 6.794 89.3 2.3889   1 273    21.0 393.45  6.48
## 506 0.04741  0 11.93    0 0.573 6.030 80.8 2.5050   1 273    21.0 396.90  7.88
##     medv
## 501 16.8
## 502 22.4
## 503 20.6
## 504 23.9
## 505 22.0
## 506 11.9

** Basic visualization

plot(crim ~ rad, data = mydata, col = "dodgerblue", pch = 20, cex = 1.5,
     main = "Boston: Crim vs Rad")

summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
library(corrplot)
## corrplot 0.84 loaded
corr <- cor(Boston)
corrplot(corr,type = "full")

** boxplot on a few variables

par(mfrow= c(2,2))
boxplot(mydata$rad, main="Index of Access to Radial Highways",col = "red")
boxplot(mydata$zn, main="Landzone",col = "yellow")
boxplot(mydata$medv, main="Median val of occupies homes",col="green")
boxplot(mydata$dis,main="Mean of distances between employment centres",col = "blue")

scatter plot

ggplot(Boston,aes(x=dis, y=crim))+ geom_point()+labs(title = 'Dis')

summary(mydata$crim)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08204  0.25651  3.61352  3.67708 88.97620
require(ggplot2)
require(plotly)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(data = Boston, x = ~tax, y = ~crim)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
featurePlot(x= mydata[,c("dis","tax","zn","rad","black","lstat","ptratio","medv")],y= mydata$crim)

pairs(Boston)

library(ggcorrplot)
corr=round(cor(Boston),1)
ggcorrplot(corr,hc.order= TRUE,lab= TRUE)

#Model1
mod1 <- lm(crim~.,Boston)
par(mfrow=c(2,2),mar=c(4,4,2,0.5))
plot(mod1)

summary(mod1)
## 
## Call:
## lm(formula = crim ~ ., data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.924 -2.120 -0.353  1.019 75.051 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.033228   7.234903   2.354 0.018949 *  
## zn            0.044855   0.018734   2.394 0.017025 *  
## indus        -0.063855   0.083407  -0.766 0.444294    
## chas         -0.749134   1.180147  -0.635 0.525867    
## nox         -10.313535   5.275536  -1.955 0.051152 .  
## rm            0.430131   0.612830   0.702 0.483089    
## age           0.001452   0.017925   0.081 0.935488    
## dis          -0.987176   0.281817  -3.503 0.000502 ***
## rad           0.588209   0.088049   6.680 6.46e-11 ***
## tax          -0.003780   0.005156  -0.733 0.463793    
## ptratio      -0.271081   0.186450  -1.454 0.146611    
## black        -0.007538   0.003673  -2.052 0.040702 *  
## lstat         0.126211   0.075725   1.667 0.096208 .  
## medv         -0.198887   0.060516  -3.287 0.001087 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared:  0.454,  Adjusted R-squared:  0.4396 
## F-statistic: 31.47 on 13 and 492 DF,  p-value: < 2.2e-16
shapiro.test(mod1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod1$residuals
## W = 0.50435, p-value < 2.2e-16
#Model2
mod2 <- lm(crim~dis +rad +medv, mydata)
par(mfrow=c(2,2),mar=c(4,4,2,0.5))
plot(mod2)

summary(mod2)
## 
## Call:
## lm(formula = crim ~ dis + rad + medv, data = mydata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.010 -1.785 -0.502  0.869 75.223 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.49320    1.23555   2.827  0.00488 ** 
## dis         -0.32412    0.15992  -2.027  0.04321 *  
## rad          0.51528    0.04051  12.719  < 2e-16 ***
## medv        -0.15844    0.03443  -4.602  5.3e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.558 on 502 degrees of freedom
## Multiple R-squared:  0.4222, Adjusted R-squared:  0.4187 
## F-statistic: 122.3 on 3 and 502 DF,  p-value: < 2.2e-16
m1<-boxcox(crim~dis+rad+medv,data=mydata)

b1<- m1$x[which.max(m1$y)]
mydata$y<- ((mydata$crim)^b1-1/b1)
bcm1<-lm(y~dis + rad + medv,data= mydata)
summary(bcm1)
## 
## Call:
## lm(formula = y ~ dis + rad + medv, data = mydata)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.133706 -0.038786 -0.004835  0.032465  0.142020 
## 
## Coefficients:
##               Estimate Std. Error   t value Pr(>|t|)    
## (Intercept) -1.552e+01  9.622e-03 -1612.846  < 2e-16 ***
## dis         -1.905e-02  1.245e-03   -15.298  < 2e-16 ***
## rad          9.880e-03  3.155e-04    31.314  < 2e-16 ***
## medv        -1.795e-03  2.681e-04    -6.696 5.74e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05107 on 502 degrees of freedom
## Multiple R-squared:  0.8448, Adjusted R-squared:  0.8439 
## F-statistic: 910.7 on 3 and 502 DF,  p-value: < 2.2e-16
plot(bcm1)

shapiro.test(bcm1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  bcm1$residuals
## W = 0.98073, p-value = 3.077e-06

** Shapiro test

for (i in 1:ncol(mydata)){
  shapiro.test(mydata[,i])$p.value %>%print()}
## [1] 1.328589e-36
## [1] 7.88283e-34
## [1] 1.064699e-17
## [1] 2.350507e-40
## [1] 5.776224e-14
## [1] 2.411977e-10
## [1] 2.230765e-18
## [1] 2.18539e-17
## [1] 8.073213e-30
## [1] 1.162781e-23
## [1] 2.360649e-17
## [1] 6.05766e-36
## [1] 8.286632e-14
## [1] 4.941386e-16
## [1] 7.734451e-15
step.model <- stepAIC(mod1,direction = "backward",trace = TRUE)
## Start:  AIC=1898.56
## crim ~ zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + black + lstat + medv
## 
##           Df Sum of Sq   RSS    AIC
## - age      1      0.27 20400 1896.6
## - chas     1     16.71 20417 1897.0
## - rm       1     20.43 20420 1897.1
## - tax      1     22.29 20422 1897.1
## - indus    1     24.30 20424 1897.2
## <none>                 20400 1898.6
## - ptratio  1     87.65 20488 1898.7
## - lstat    1    115.18 20515 1899.4
## - nox      1    158.47 20558 1900.5
## - black    1    174.58 20574 1900.9
## - zn       1    237.70 20638 1902.4
## - medv     1    447.85 20848 1907.5
## - dis      1    508.77 20909 1909.0
## - rad      1   1850.44 22250 1940.5
## 
## Step:  AIC=1896.56
## crim ~ zn + indus + chas + nox + rm + dis + rad + tax + ptratio + 
##     black + lstat + medv
## 
##           Df Sum of Sq   RSS    AIC
## - chas     1     16.54 20417 1895.0
## - rm       1     22.14 20422 1895.1
## - tax      1     22.16 20422 1895.1
## - indus    1     24.30 20424 1895.2
## <none>                 20400 1896.6
## - ptratio  1     87.41 20488 1896.7
## - lstat    1    131.43 20532 1897.8
## - nox      1    166.37 20567 1898.7
## - black    1    174.40 20575 1898.9
## - zn       1    239.21 20639 1900.5
## - medv     1    447.81 20848 1905.5
## - dis      1    559.06 20959 1908.2
## - rad      1   1857.98 22258 1938.7
## 
## Step:  AIC=1894.97
## crim ~ zn + indus + nox + rm + dis + rad + tax + ptratio + black + 
##     lstat + medv
## 
##           Df Sum of Sq   RSS    AIC
## - tax      1     18.81 20436 1893.4
## - rm       1     22.76 20440 1893.5
## - indus    1     28.82 20446 1893.7
## <none>                 20417 1895.0
## - ptratio  1     84.57 20501 1895.1
## - lstat    1    129.63 20546 1896.2
## - nox      1    175.96 20593 1897.3
## - black    1    178.37 20595 1897.4
## - zn       1    241.26 20658 1898.9
## - medv     1    483.38 20900 1904.8
## - dis      1    563.37 20980 1906.8
## - rad      1   1842.82 22260 1936.7
## 
## Step:  AIC=1893.44
## crim ~ zn + indus + nox + rm + dis + rad + ptratio + black + 
##     lstat + medv
## 
##           Df Sum of Sq   RSS    AIC
## - rm       1      23.0 20459 1892.0
## - indus    1      64.4 20500 1893.0
## <none>                 20436 1893.4
## - ptratio  1      87.4 20523 1893.6
## - lstat    1     137.9 20574 1894.8
## - black    1     178.1 20614 1895.8
## - nox      1     181.9 20617 1895.9
## - zn       1     222.9 20658 1896.9
## - medv     1     465.3 20901 1902.8
## - dis      1     556.9 20992 1905.0
## - rad      1    4693.4 25129 1996.0
## 
## Step:  AIC=1892.01
## crim ~ zn + indus + nox + dis + rad + ptratio + black + lstat + 
##     medv
## 
##           Df Sum of Sq   RSS    AIC
## - indus    1      74.0 20533 1891.8
## <none>                 20459 1892.0
## - ptratio  1      88.2 20547 1892.2
## - lstat    1     118.9 20577 1892.9
## - nox      1     176.9 20636 1894.4
## - black    1     202.4 20661 1895.0
## - zn       1     233.9 20692 1895.8
## - medv     1     458.7 20917 1901.2
## - dis      1     572.2 21031 1904.0
## - rad      1    4811.3 25270 1996.9
## 
## Step:  AIC=1891.83
## crim ~ zn + nox + dis + rad + ptratio + black + lstat + medv
## 
##           Df Sum of Sq   RSS    AIC
## <none>                 20533 1891.8
## - lstat    1     104.7 20637 1892.4
## - ptratio  1     119.0 20652 1892.8
## - black    1     198.4 20731 1894.7
## - zn       1     239.6 20772 1895.7
## - nox      1     296.6 20829 1897.1
## - medv     1     430.2 20963 1900.3
## - dis      1     507.8 21040 1902.2
## - rad      1    4739.5 25272 1994.9
summary(step.model)
## 
## Call:
## lm(formula = crim ~ zn + nox + dis + rad + ptratio + black + 
##     lstat + medv, data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.860 -2.102 -0.363  0.895 75.702 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.683128   6.086010   3.234 0.001301 ** 
## zn            0.043293   0.017977   2.408 0.016394 *  
## nox         -12.753708   4.760157  -2.679 0.007623 ** 
## dis          -0.918318   0.261932  -3.506 0.000496 ***
## rad           0.532617   0.049727  10.711  < 2e-16 ***
## ptratio      -0.310541   0.182941  -1.697 0.090229 .  
## black        -0.007922   0.003615  -2.191 0.028897 *  
## lstat         0.110173   0.069219   1.592 0.112097    
## medv         -0.174207   0.053988  -3.227 0.001334 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.428 on 497 degrees of freedom
## Multiple R-squared:  0.4505, Adjusted R-squared:  0.4416 
## F-statistic: 50.92 on 8 and 497 DF,  p-value: < 2.2e-16
broom::glance(bcm1)%>% dplyr::select(AIC,BIC)
## # A tibble: 1 x 2
##      AIC    BIC
##    <dbl>  <dbl>
## 1 -1568. -1547.
test_data <- sample_n(mydata,5)
test_data
##      crim zn indus chas   nox    rm   age    dis rad tax ptratio  black lstat
## 1 0.16760  0  7.38    0 0.493 6.426  52.3 4.5404   5 287    19.6 396.90  7.20
## 2 0.88125  0 21.89    0 0.624 5.637  94.7 1.9799   4 437    21.2 396.90 18.34
## 3 0.13158  0 10.01    0 0.547 6.176  72.5 2.7301   6 432    17.8 393.30 12.04
## 4 1.35472  0  8.14    0 0.538 6.072 100.0 4.1750   4 307    21.0 376.73 13.04
## 5 0.06211 40  1.25    0 0.429 6.490  44.4 8.7921   1 335    19.7 396.90  5.98
##   medv         y
## 1 23.8 -15.60260
## 2 14.3 -15.50763
## 3 21.2 -15.61566
## 4 14.5 -15.48143
## 5 22.9 -15.65500
distPred <- predict(bcm1,test_data,interval = "confidence",level=0.9)
distPred <- (b1*distPred+1)**(1/b1)

actuals_preds <- data.frame(cbind(actuals=test_data$crim, predicteds=distPred))  
actuals_preds
##   actuals          fit          lwr          upr
## 1 0.16760 1.452904e-21 1.343790e-21 1.570299e-21
## 2 0.88125 3.927925e-21 3.421960e-21 4.503550e-21
## 3 0.13158 3.484378e-21 3.184992e-21 3.810051e-21
## 4 1.35472 1.866524e-21 1.664043e-21 2.091981e-21
## 5 0.06211 1.405020e-22 1.139081e-22 1.728488e-22